{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Examples and Exercises from Think Stats, 2nd Edition\n",
    "\n",
    "http://thinkstats2.com\n",
    "\n",
    "Copyright 2016 Allen B. Downey\n",
    "\n",
    "MIT License: https://opensource.org/licenses/MIT\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'thinkstats2'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-5d823b2f1660>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrandom\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mthinkstats2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     14\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mthinkplot\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'thinkstats2'"
     ]
    }
   ],
   "source": [
    "from __future__ import print_function, division\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import random\n",
    "\n",
    "import thinkstats2\n",
    "import thinkplot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Time series analysis\n",
    "\n",
    "Load the data from \"Price of Weed\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "transactions = pd.read_csv('mj-clean.csv', parse_dates=[5])\n",
    "transactions.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following function takes a DataFrame of transactions and compute daily averages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def GroupByDay(transactions, func=np.mean):\n",
    "    \"\"\"Groups transactions by day and compute the daily mean ppg.\n",
    "\n",
    "    transactions: DataFrame of transactions\n",
    "\n",
    "    returns: DataFrame of daily prices\n",
    "    \"\"\"\n",
    "    grouped = transactions[['date', 'ppg']].groupby('date')\n",
    "    daily = grouped.aggregate(func)\n",
    "\n",
    "    daily['date'] = daily.index\n",
    "    start = daily.date[0]\n",
    "    one_year = np.timedelta64(1, 'Y')\n",
    "    daily['years'] = (daily.date - start) / one_year\n",
    "\n",
    "    return daily"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following function returns a map from quality name to a DataFrame of daily averages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def GroupByQualityAndDay(transactions):\n",
    "    \"\"\"Divides transactions by quality and computes mean daily price.\n",
    "\n",
    "    transaction: DataFrame of transactions\n",
    "    \n",
    "    returns: map from quality to time series of ppg\n",
    "    \"\"\"\n",
    "    groups = transactions.groupby('quality')\n",
    "    dailies = {}\n",
    "    for name, group in groups:\n",
    "        dailies[name] = GroupByDay(group)        \n",
    "\n",
    "    return dailies"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`dailies` is the map from quality name to DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dailies = GroupByQualityAndDay(transactions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following plots the daily average price for each quality."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "thinkplot.PrePlot(rows=3)\n",
    "for i, (name, daily) in enumerate(dailies.items()):\n",
    "    thinkplot.SubPlot(i+1)\n",
    "    title = 'Price per gram ($)' if i == 0 else ''\n",
    "    thinkplot.Config(ylim=[0, 20], title=title)\n",
    "    thinkplot.Scatter(daily.ppg, s=10, label=name)\n",
    "    if i == 2: \n",
    "        plt.xticks(rotation=30)\n",
    "        thinkplot.Config()\n",
    "    else:\n",
    "        thinkplot.Config(xticks=[])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can use `statsmodels` to run a linear model of price as a function of time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.formula.api as smf\n",
    "\n",
    "def RunLinearModel(daily):\n",
    "    model = smf.ols('ppg ~ years', data=daily)\n",
    "    results = model.fit()\n",
    "    return model, results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what the results look like."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import display\n",
    "\n",
    "for name, daily in dailies.items():\n",
    "    model, results = RunLinearModel(daily)\n",
    "    print(name)\n",
    "    display(results.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's plot the fitted model with the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotFittedValues(model, results, label=''):\n",
    "    \"\"\"Plots original data and fitted values.\n",
    "\n",
    "    model: StatsModel model object\n",
    "    results: StatsModel results object\n",
    "    \"\"\"\n",
    "    years = model.exog[:,1]\n",
    "    values = model.endog\n",
    "    thinkplot.Scatter(years, values, s=15, label=label)\n",
    "    thinkplot.Plot(years, results.fittedvalues, label='model', color='#ff7f00')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following function plots the original data and the fitted curve."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotLinearModel(daily, name):\n",
    "    \"\"\"Plots a linear fit to a sequence of prices, and the residuals.\n",
    "    \n",
    "    daily: DataFrame of daily prices\n",
    "    name: string\n",
    "    \"\"\"\n",
    "    model, results = RunLinearModel(daily)\n",
    "    PlotFittedValues(model, results, label=name)\n",
    "    thinkplot.Config(title='Fitted values',\n",
    "                     xlabel='Years',\n",
    "                     xlim=[-0.1, 3.8],\n",
    "                     ylabel='Price per gram ($)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are results for the high quality category:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "name = 'high'\n",
    "daily = dailies[name]\n",
    "\n",
    "PlotLinearModel(daily, name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Moving averages\n",
    "\n",
    "As a simple example, I'll show the rolling average of the numbers from 1 to 10."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "array = np.arange(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With a \"window\" of size 3, we get the average of the previous 3 elements, or nan when there are fewer than 3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# It looks like this doesn't exist in recent versions of Pandas\n",
    "\n",
    "# pd.rolling_mean(array, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# But Series now provides `rolling`\n",
    "\n",
    "series = pd.Series(array)\n",
    "series.rolling(3).mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following function plots the rolling mean."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotRollingMean(daily, name):\n",
    "    \"\"\"Plots rolling mean.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    \"\"\"\n",
    "    dates = pd.date_range(daily.index.min(), daily.index.max())\n",
    "    reindexed = daily.reindex(dates)\n",
    "\n",
    "    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.2, label=name)\n",
    "    roll_mean = pd.rolling_mean(reindexed.ppg, 30)\n",
    "    thinkplot.Plot(roll_mean, label='rolling mean', color='#ff7f00')\n",
    "    plt.xticks(rotation=30)\n",
    "    thinkplot.Config(ylabel='price per gram ($)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what it looks like for the high quality category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PlotRollingMean(daily, name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The exponentially-weighted moving average gives more weight to more recent points."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotEWMA(daily, name):\n",
    "    \"\"\"Plots rolling mean.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    \"\"\"\n",
    "    dates = pd.date_range(daily.index.min(), daily.index.max())\n",
    "    reindexed = daily.reindex(dates)\n",
    "\n",
    "    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.2, label=name)\n",
    "    roll_mean = reindexed.ppg.ewm(30).mean()\n",
    "    thinkplot.Plot(roll_mean, label='EWMA', color='#ff7f00')\n",
    "    plt.xticks(rotation=30)\n",
    "    thinkplot.Config(ylabel='price per gram ($)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PlotEWMA(daily, name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can use resampling to generate missing values with the right amount of noise."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def FillMissing(daily, span=30):\n",
    "    \"\"\"Fills missing values with an exponentially weighted moving average.\n",
    "\n",
    "    Resulting DataFrame has new columns 'ewma' and 'resid'.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    span: window size (sort of) passed to ewma\n",
    "\n",
    "    returns: new DataFrame of daily prices\n",
    "    \"\"\"\n",
    "    dates = pd.date_range(daily.index.min(), daily.index.max())\n",
    "    reindexed = daily.reindex(dates)\n",
    "\n",
    "    ewma = pd.ewma(reindexed.ppg, span=span)\n",
    "\n",
    "    resid = (reindexed.ppg - ewma).dropna()\n",
    "    fake_data = ewma + thinkstats2.Resample(resid, len(reindexed))\n",
    "    reindexed.ppg.fillna(fake_data, inplace=True)\n",
    "\n",
    "    reindexed['ewma'] = ewma\n",
    "    reindexed['resid'] = reindexed.ppg - ewma\n",
    "    return reindexed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotFilled(daily, name):\n",
    "    \"\"\"Plots the EWMA and filled data.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    \"\"\"\n",
    "    filled = FillMissing(daily, span=30)\n",
    "    thinkplot.Scatter(filled.ppg, s=15, alpha=0.2, label=name)\n",
    "    thinkplot.Plot(filled.ewma, label='EWMA', color='#ff7f00')\n",
    "    plt.xticks(rotation=30)\n",
    "    thinkplot.Config(ylabel='Price per gram ($)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what the EWMA model looks like with missing values filled."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PlotFilled(daily, name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Serial correlation\n",
    "\n",
    "The following function computes serial correlation with the given lag."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SerialCorr(series, lag=1):\n",
    "    xs = series[lag:]\n",
    "    ys = series.shift(lag)[lag:]\n",
    "    corr = thinkstats2.Corr(xs, ys)\n",
    "    return corr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before computing correlations, we'll fill missing values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filled_dailies = {}\n",
    "for name, daily in dailies.items():\n",
    "    filled_dailies[name] = FillMissing(daily, span=30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are the serial correlations for raw price data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name, filled in filled_dailies.items():            \n",
    "    corr = thinkstats2.SerialCorr(filled.ppg, lag=1)\n",
    "    print(name, corr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It's not surprising that there are correlations between consecutive days, because there are obvious trends in the data.\n",
    "\n",
    "It is more interested to see whether there are still correlations after we subtract away the trends."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for name, filled in filled_dailies.items():            \n",
    "    corr = thinkstats2.SerialCorr(filled.resid, lag=1)\n",
    "    print(name, corr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Even if the correlations between consecutive days are weak, there might be correlations across intervals of one week, one month, or one year."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = []\n",
    "for lag in [1, 7, 30, 365]:\n",
    "    print(lag, end='\\t')\n",
    "    for name, filled in filled_dailies.items():            \n",
    "        corr = SerialCorr(filled.resid, lag)\n",
    "        print('%.2g' % corr, end='\\t')\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The strongest correlation is a weekly cycle in the medium quality category.\n",
    "\n",
    "## Autocorrelation\n",
    "\n",
    "The autocorrelation function is the serial correlation computed for all lags.\n",
    "\n",
    "We can use it to replicate the results from the previous section."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.tsa.stattools as smtsa\n",
    "\n",
    "filled = filled_dailies['high']\n",
    "acf = smtsa.acf(filled.resid, nlags=365, unbiased=True)\n",
    "print('%0.2g, %.2g, %0.2g, %0.2g, %0.2g' % \n",
    "      (acf[0], acf[1], acf[7], acf[30], acf[365]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To get a sense of how much autocorrelation we should expect by chance, we can resample the data (which eliminates any actual autocorrelation) and compute the ACF."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SimulateAutocorrelation(daily, iters=1001, nlags=40):\n",
    "    \"\"\"Resample residuals, compute autocorrelation, and plot percentiles.\n",
    "\n",
    "    daily: DataFrame\n",
    "    iters: number of simulations to run\n",
    "    nlags: maximum lags to compute autocorrelation\n",
    "    \"\"\"\n",
    "    # run simulations\n",
    "    t = []\n",
    "    for _ in range(iters):\n",
    "        filled = FillMissing(daily, span=30)\n",
    "        resid = thinkstats2.Resample(filled.resid)\n",
    "        acf = smtsa.acf(resid, nlags=nlags, unbiased=True)[1:]\n",
    "        t.append(np.abs(acf))\n",
    "\n",
    "    high = thinkstats2.PercentileRows(t, [97.5])[0]\n",
    "    low = -high\n",
    "    lags = range(1, nlags+1)\n",
    "    thinkplot.FillBetween(lags, low, high, alpha=0.2, color='gray')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following function plots the actual autocorrelation for lags up to 40 days.\n",
    "\n",
    "The flag `add_weekly` indicates whether we should add a simulated weekly cycle."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotAutoCorrelation(dailies, nlags=40, add_weekly=False):\n",
    "    \"\"\"Plots autocorrelation functions.\n",
    "\n",
    "    dailies: map from category name to DataFrame of daily prices\n",
    "    nlags: number of lags to compute\n",
    "    add_weekly: boolean, whether to add a simulated weekly pattern\n",
    "    \"\"\"\n",
    "    thinkplot.PrePlot(3)\n",
    "    daily = dailies['high']\n",
    "    SimulateAutocorrelation(daily)\n",
    "\n",
    "    for name, daily in dailies.items():\n",
    "\n",
    "        if add_weekly:\n",
    "            daily = AddWeeklySeasonality(daily)\n",
    "\n",
    "        filled = FillMissing(daily, span=30)\n",
    "\n",
    "        acf = smtsa.acf(filled.resid, nlags=nlags, unbiased=True)\n",
    "        lags = np.arange(len(acf))\n",
    "        thinkplot.Plot(lags[1:], acf[1:], label=name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To show what a strong weekly cycle would look like, we have the option of adding a price increase of 1-2 dollars on Friday and Saturdays."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def AddWeeklySeasonality(daily):\n",
    "    \"\"\"Adds a weekly pattern.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "\n",
    "    returns: new DataFrame of daily prices\n",
    "    \"\"\"\n",
    "    fri_or_sat = (daily.index.dayofweek==4) | (daily.index.dayofweek==5)\n",
    "    fake = daily.copy()\n",
    "    fake.ppg.loc[fri_or_sat] += np.random.uniform(0, 2, fri_or_sat.sum())\n",
    "    return fake"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what the real ACFs look like.  The gray regions indicate the levels we expect by chance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "axis = [0, 41, -0.2, 0.2]\n",
    "\n",
    "PlotAutoCorrelation(dailies, add_weekly=False)\n",
    "thinkplot.Config(axis=axis, \n",
    "                     loc='lower right',\n",
    "                     ylabel='correlation',\n",
    "                     xlabel='lag (day)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what it would look like if there were a weekly cycle."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PlotAutoCorrelation(dailies, add_weekly=True)\n",
    "thinkplot.Config(axis=axis,\n",
    "                 loc='lower right',\n",
    "                 xlabel='lag (days)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prediction\n",
    "\n",
    "The simplest way to generate predictions is to use `statsmodels` to fit a model to the data, then use the `predict` method from the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def GenerateSimplePrediction(results, years):\n",
    "    \"\"\"Generates a simple prediction.\n",
    "\n",
    "    results: results object\n",
    "    years: sequence of times (in years) to make predictions for\n",
    "\n",
    "    returns: sequence of predicted values\n",
    "    \"\"\"\n",
    "    n = len(years)\n",
    "    inter = np.ones(n)\n",
    "    d = dict(Intercept=inter, years=years, years2=years**2)\n",
    "    predict_df = pd.DataFrame(d)\n",
    "    predict = results.predict(predict_df)\n",
    "    return predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotSimplePrediction(results, years):\n",
    "    predict = GenerateSimplePrediction(results, years)\n",
    "\n",
    "    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.2, label=name)\n",
    "    thinkplot.plot(years, predict, color='#ff7f00')\n",
    "    xlim = years[0]-0.1, years[-1]+0.1\n",
    "    thinkplot.Config(title='Predictions',\n",
    "                 xlabel='Years',\n",
    "                 xlim=xlim,\n",
    "                 ylabel='Price per gram ($)',\n",
    "                 loc='upper right')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what the prediction looks like for the high quality category, using the linear model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "name = 'high'\n",
    "daily = dailies[name]\n",
    "\n",
    "_, results = RunLinearModel(daily)\n",
    "years = np.linspace(0, 5, 101)\n",
    "PlotSimplePrediction(results, years)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When we generate predictions, we want to quatify the uncertainty in the prediction.  We can do that by resampling.  The following function fits a model to the data, computes residuals, then resamples from the residuals to general fake datasets.  It fits the same model to each fake dataset and returns a list of results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SimulateResults(daily, iters=101, func=RunLinearModel):\n",
    "    \"\"\"Run simulations based on resampling residuals.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    iters: number of simulations\n",
    "    func: function that fits a model to the data\n",
    "\n",
    "    returns: list of result objects\n",
    "    \"\"\"\n",
    "    _, results = func(daily)\n",
    "    fake = daily.copy()\n",
    "    \n",
    "    result_seq = []\n",
    "    for _ in range(iters):\n",
    "        fake.ppg = results.fittedvalues + thinkstats2.Resample(results.resid)\n",
    "        _, fake_results = func(fake)\n",
    "        result_seq.append(fake_results)\n",
    "\n",
    "    return result_seq"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To generate predictions, we take the list of results fitted to resampled data.  For each model, we use the `predict` method to generate predictions, and return a sequence of predictions.\n",
    "\n",
    "If `add_resid` is true, we add resampled residuals to the predicted values, which generates predictions that include predictive uncertainty (due to random noise) as well as modeling uncertainty (due to random sampling)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def GeneratePredictions(result_seq, years, add_resid=False):\n",
    "    \"\"\"Generates an array of predicted values from a list of model results.\n",
    "\n",
    "    When add_resid is False, predictions represent sampling error only.\n",
    "\n",
    "    When add_resid is True, they also include residual error (which is\n",
    "    more relevant to prediction).\n",
    "    \n",
    "    result_seq: list of model results\n",
    "    years: sequence of times (in years) to make predictions for\n",
    "    add_resid: boolean, whether to add in resampled residuals\n",
    "\n",
    "    returns: sequence of predictions\n",
    "    \"\"\"\n",
    "    n = len(years)\n",
    "    d = dict(Intercept=np.ones(n), years=years, years2=years**2)\n",
    "    predict_df = pd.DataFrame(d)\n",
    "    \n",
    "    predict_seq = []\n",
    "    for fake_results in result_seq:\n",
    "        predict = fake_results.predict(predict_df)\n",
    "        if add_resid:\n",
    "            predict += thinkstats2.Resample(fake_results.resid, n)\n",
    "        predict_seq.append(predict)\n",
    "\n",
    "    return predict_seq"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To visualize predictions, I show a darker region that quantifies modeling uncertainty and a lighter region that quantifies predictive uncertainty."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotPredictions(daily, years, iters=101, percent=90, func=RunLinearModel):\n",
    "    \"\"\"Plots predictions.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    years: sequence of times (in years) to make predictions for\n",
    "    iters: number of simulations\n",
    "    percent: what percentile range to show\n",
    "    func: function that fits a model to the data\n",
    "    \"\"\"\n",
    "    result_seq = SimulateResults(daily, iters=iters, func=func)\n",
    "    p = (100 - percent) / 2\n",
    "    percents = p, 100-p\n",
    "\n",
    "    predict_seq = GeneratePredictions(result_seq, years, add_resid=True)\n",
    "    low, high = thinkstats2.PercentileRows(predict_seq, percents)\n",
    "    thinkplot.FillBetween(years, low, high, alpha=0.3, color='gray')\n",
    "\n",
    "    predict_seq = GeneratePredictions(result_seq, years, add_resid=False)\n",
    "    low, high = thinkstats2.PercentileRows(predict_seq, percents)\n",
    "    thinkplot.FillBetween(years, low, high, alpha=0.5, color='gray')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are the results for the high quality category."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "years = np.linspace(0, 5, 101)\n",
    "thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)\n",
    "PlotPredictions(daily, years)\n",
    "xlim = years[0]-0.1, years[-1]+0.1\n",
    "thinkplot.Config(title='Predictions',\n",
    "                   xlabel='Years',\n",
    "                   xlim=xlim,\n",
    "                   ylabel='Price per gram ($)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But there is one more source of uncertainty: how much past data should we use to build the model?\n",
    "\n",
    "The following function generates a sequence of models based on different amounts of past data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SimulateIntervals(daily, iters=101, func=RunLinearModel):\n",
    "    \"\"\"Run simulations based on different subsets of the data.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    iters: number of simulations\n",
    "    func: function that fits a model to the data\n",
    "\n",
    "    returns: list of result objects\n",
    "    \"\"\"\n",
    "    result_seq = []\n",
    "    starts = np.linspace(0, len(daily), iters).astype(int)\n",
    "\n",
    "    for start in starts[:-2]:\n",
    "        subset = daily[start:]\n",
    "        _, results = func(subset)\n",
    "        fake = subset.copy()\n",
    "\n",
    "        for _ in range(iters):\n",
    "            fake.ppg = (results.fittedvalues + \n",
    "                        thinkstats2.Resample(results.resid))\n",
    "            _, fake_results = func(fake)\n",
    "            result_seq.append(fake_results)\n",
    "\n",
    "    return result_seq"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And this function plots the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PlotIntervals(daily, years, iters=101, percent=90, func=RunLinearModel):\n",
    "    \"\"\"Plots predictions based on different intervals.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "    years: sequence of times (in years) to make predictions for\n",
    "    iters: number of simulations\n",
    "    percent: what percentile range to show\n",
    "    func: function that fits a model to the data\n",
    "    \"\"\"\n",
    "    result_seq = SimulateIntervals(daily, iters=iters, func=func)\n",
    "    p = (100 - percent) / 2\n",
    "    percents = p, 100-p\n",
    "\n",
    "    predict_seq = GeneratePredictions(result_seq, years, add_resid=True)\n",
    "    low, high = thinkstats2.PercentileRows(predict_seq, percents)\n",
    "    thinkplot.FillBetween(years, low, high, alpha=0.2, color='gray')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's what the high quality category looks like if we take into account uncertainty about how much past data to use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "name = 'high'\n",
    "daily = dailies[name]\n",
    "\n",
    "thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)\n",
    "PlotIntervals(daily, years)\n",
    "PlotPredictions(daily, years)\n",
    "xlim = years[0]-0.1, years[-1]+0.1\n",
    "thinkplot.Config(title='Predictions',\n",
    "                 xlabel='Years',\n",
    "                 xlim=xlim,\n",
    "                 ylabel='Price per gram ($)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Exercises"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "**Exercise:**   The linear model I used in this chapter has the obvious drawback that it is linear, and there is no reason to expect prices to change linearly over time. We can add flexibility to the model by adding a quadratic term, as we did in Section 11.3.\n",
    "\n",
    "Use a quadratic model to fit the time series of daily prices, and use the model to generate predictions. You will have to write a version of `RunLinearModel` that runs that quadratic model, but after that you should be able to reuse code from the chapter to generate predictions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "def RunQuadraticModel(daily):\n",
    "    \"\"\"Runs a linear model of prices versus years.\n",
    "\n",
    "    daily: DataFrame of daily prices\n",
    "\n",
    "    returns: model, results\n",
    "    \"\"\"\n",
    "    daily['years2'] = daily.years**2\n",
    "    model = smf.ols('ppg ~ years + years2', data=daily)\n",
    "    results = model.fit()\n",
    "    return model, results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "name = 'high'\n",
    "daily = dailies[name]\n",
    "\n",
    "model, results = RunQuadraticModel(daily)\n",
    "results.summary()    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "PlotFittedValues(model, results, label=name)\n",
    "thinkplot.Config(title='Fitted values',\n",
    "                 xlabel='Years',\n",
    "                 xlim=[-0.1, 3.8],\n",
    "                 ylabel='price per gram ($)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "years = np.linspace(0, 5, 101)\n",
    "thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)\n",
    "PlotPredictions(daily, years, func=RunQuadraticModel)\n",
    "thinkplot.Config(title='predictions',\n",
    "                 xlabel='Years',\n",
    "                 xlim=[years[0]-0.1, years[-1]+0.1],\n",
    "                 ylabel='Price per gram ($)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Exercise:** Write a definition for a class named `SerialCorrelationTest` that extends `HypothesisTest` from Section 9.2. It should take a series and a lag as data, compute the serial correlation of the series with the given lag, and then compute the p-value of the observed correlation.\n",
    "\n",
    "Use this class to test whether the serial correlation in raw price data is statistically significant. Also test the residuals of the linear model and (if you did the previous exercise), the quadratic model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "class SerialCorrelationTest(thinkstats2.HypothesisTest):\n",
    "    \"\"\"Tests serial correlations by permutation.\"\"\"\n",
    "\n",
    "    def TestStatistic(self, data):\n",
    "        \"\"\"Computes the test statistic.\n",
    "\n",
    "        data: tuple of xs and ys\n",
    "        \"\"\"\n",
    "        series, lag = data\n",
    "        test_stat = abs(SerialCorr(series, lag))\n",
    "        return test_stat\n",
    "\n",
    "    def RunModel(self):\n",
    "        \"\"\"Run the model of the null hypothesis.\n",
    "\n",
    "        returns: simulated data\n",
    "        \"\"\"\n",
    "        series, lag = self.data\n",
    "        permutation = series.reindex(np.random.permutation(series.index))\n",
    "        return permutation, lag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "# test the correlation between consecutive prices\n",
    "\n",
    "name = 'high'\n",
    "daily = dailies[name]\n",
    "\n",
    "series = daily.ppg\n",
    "test = SerialCorrelationTest((series, 1))\n",
    "pvalue = test.PValue()\n",
    "print(test.actual, pvalue)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "# test for serial correlation in residuals of the linear model\n",
    "\n",
    "_, results = RunLinearModel(daily)\n",
    "series = results.resid\n",
    "test = SerialCorrelationTest((series, 1))\n",
    "pvalue = test.PValue()\n",
    "print(test.actual, pvalue)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Solution\n",
    "\n",
    "# test for serial correlation in residuals of the quadratic model\n",
    "\n",
    "_, results = RunQuadraticModel(daily)\n",
    "series = results.resid\n",
    "test = SerialCorrelationTest((series, 1))\n",
    "pvalue = test.PValue()\n",
    "print(test.actual, pvalue)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Worked example:** There are several ways to extend the EWMA model to generate predictions. One of the simplest is something like this:\n",
    "\n",
    "1. Compute the EWMA of the time series and use the last point as an intercept, `inter`.\n",
    "\n",
    "2. Compute the EWMA of differences between successive elements in the time series and use the last point as a slope, `slope`.\n",
    "\n",
    "3. To predict values at future times, compute `inter + slope * dt`, where `dt` is the difference between the time of the prediction and the time of the last observation.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "name = 'high'\n",
    "daily = dailies[name]\n",
    "\n",
    "filled = FillMissing(daily)\n",
    "diffs = filled.ppg.diff()\n",
    "\n",
    "thinkplot.plot(diffs)\n",
    "plt.xticks(rotation=30)\n",
    "thinkplot.Config(ylabel='Daily change in price per gram ($)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filled['slope'] = diffs.ewm(span=365).mean()\n",
    "thinkplot.plot(filled.slope[-365:])\n",
    "plt.xticks(rotation=30)\n",
    "thinkplot.Config(ylabel='EWMA of diff ($)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract the last inter and the mean of the last 30 slopes\n",
    "start = filled.index[-1]\n",
    "inter = filled.ewma[-1]\n",
    "slope = filled.slope[-30:].mean()\n",
    "\n",
    "start, inter, slope"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reindex the DataFrame, adding a year to the end\n",
    "dates = pd.date_range(filled.index.min(), \n",
    "                      filled.index.max() + np.timedelta64(365, 'D'))\n",
    "predicted = filled.reindex(dates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate predicted values and add them to the end\n",
    "predicted['date'] = predicted.index\n",
    "one_day = np.timedelta64(1, 'D')\n",
    "predicted['days'] = (predicted.date - start) / one_day\n",
    "predict = inter + slope * predicted.days\n",
    "predicted.ewma.fillna(predict, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot the actual values and predictions\n",
    "thinkplot.Scatter(daily.ppg, alpha=0.1, label=name)\n",
    "thinkplot.Plot(predicted.ewma, color='#ff7f00')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an exercise, run this analysis again for the other quality categories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
